{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw data from douban.csv. header=None tells pandas the file has no\n",
    "# header row, so the column names are supplied explicitly.\n",
    "df = pd.read_csv(\n",
    "    '../data/douban.csv',\n",
    "    names=['p_name', 'p_url', 'c_date_time', 'c_data', 'c_rank', 'c_recom'],\n",
    "    header=None,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>p_name</th>\n",
       "      <th>p_url</th>\n",
       "      <th>c_date_time</th>\n",
       "      <th>c_data</th>\n",
       "      <th>c_rank</th>\n",
       "      <th>c_recom</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>王大根</td>\n",
       "      <td>https://www.douban.com/people/diewithme/</td>\n",
       "      <td>2018-01-19 18:17:25</td>\n",
       "      <td>在这种家庭里做一条狗都好啊\\n</td>\n",
       "      <td>力荐</td>\n",
       "      <td>6463</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>李阿斗</td>\n",
       "      <td>https://www.douban.com/people/gailsylee/</td>\n",
       "      <td>2017-11-25 02:12:27</td>\n",
       "      <td>当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...</td>\n",
       "      <td>力荐</td>\n",
       "      <td>3429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>光明小卖部</td>\n",
       "      <td>https://www.douban.com/people/gooooooooooohe/</td>\n",
       "      <td>2017-12-06 15:10:45</td>\n",
       "      <td>所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n</td>\n",
       "      <td>力荐</td>\n",
       "      <td>3349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>同志亦凡人中文站</td>\n",
       "      <td>https://www.douban.com/people/3540441/</td>\n",
       "      <td>2017-11-24 15:57:52</td>\n",
       "      <td>有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...</td>\n",
       "      <td>推荐</td>\n",
       "      <td>1814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>桃桃淘电影</td>\n",
       "      <td>https://www.douban.com/people/qijiuzhiyue/</td>\n",
       "      <td>2018-01-19 14:18:28</td>\n",
       "      <td>其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...</td>\n",
       "      <td>推荐</td>\n",
       "      <td>1711</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     p_name                                          p_url  \\\n",
       "0       王大根       https://www.douban.com/people/diewithme/   \n",
       "1       李阿斗       https://www.douban.com/people/gailsylee/   \n",
       "2     光明小卖部  https://www.douban.com/people/gooooooooooohe/   \n",
       "3  同志亦凡人中文站         https://www.douban.com/people/3540441/   \n",
       "4     桃桃淘电影     https://www.douban.com/people/qijiuzhiyue/   \n",
       "\n",
       "           c_date_time                                             c_data  \\\n",
       "0  2018-01-19 18:17:25                            在这种家庭里做一条狗都好啊\\n           \n",
       "1  2017-11-25 02:12:27   当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...   \n",
       "2  2017-12-06 15:10:45            所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n           \n",
       "3  2017-11-24 15:57:52   有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...   \n",
       "4  2018-01-19 14:18:28   其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...   \n",
       "\n",
       "  c_rank  c_recom  \n",
       "0     力荐     6463  \n",
       "1     力荐     3429  \n",
       "2     力荐     3349  \n",
       "3     推荐     1814  \n",
       "4     推荐     1711  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the freshly loaded frame.\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "p_name         0\n",
      "p_url          0\n",
      "c_date_time    0\n",
      "c_data         0\n",
      "c_rank         0\n",
      "c_recom        0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Count the missing values in each column.\n",
    "missing_per_column = df.isnull().sum()\n",
    "print(missing_per_column)\n",
    "# Rows are not dropped at this point; the dropna call is left disabled:\n",
    "#df.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 拆分原c_date_time为c_date和c_time\n",
    "def get_date(date_time):\n",
    "    \"\"\"Return the date part of a 'date time' string.\n",
    "\n",
    "    Args:\n",
    "        date_time: Raw timestamp text such as '2018-01-19 18:17:25'.\n",
    "\n",
    "    Returns:\n",
    "        The leading digits-dash-digits-dash-digits part as a string, or None\n",
    "        when the value is not a string, is shorter than 10 characters, or\n",
    "        does not match the expected layout.\n",
    "    \"\"\"\n",
    "    # Non-string cells (e.g. NaN for a missing value) have no len(); treat\n",
    "    # them as malformed instead of raising TypeError.\n",
    "    if not isinstance(date_time, str) or len(date_time) < 10:\n",
    "        return None\n",
    "    # re.search with an explicit check replaces re.findall(...)[0], which\n",
    "    # raised IndexError on long-enough strings that do not match.\n",
    "    found = re.search(r'(\\d+-\\d+-\\d+) \\d+.*?', date_time)\n",
    "    return found.group(1) if found else None\n",
    "\n",
    "\n",
    "def get_time(date_time):\n",
    "    \"\"\"Return the time-of-day part of a 'date time' string.\n",
    "\n",
    "    Args:\n",
    "        date_time: Raw timestamp text such as '2018-01-19 18:17:25'.\n",
    "\n",
    "    Returns:\n",
    "        The digits:digits:digits part that follows a space, as a string, or\n",
    "        None when the value is not a string, is shorter than 10 characters,\n",
    "        or contains no such part.\n",
    "    \"\"\"\n",
    "    # Non-string cells (e.g. NaN for a missing value) have no len(); treat\n",
    "    # them as malformed instead of raising TypeError.\n",
    "    if not isinstance(date_time, str) or len(date_time) < 10:\n",
    "        return None\n",
    "    # re.search with an explicit check replaces re.findall(...)[0], which\n",
    "    # raised IndexError on long-enough strings that do not match.\n",
    "    found = re.search(r'.*? (\\d+:\\d+:\\d+)', date_time)\n",
    "    return found.group(1) if found else None\n",
    "\n",
    "\n",
    "# Derive the c_date and c_time columns from the combined timestamp column.\n",
    "timestamps = df['c_date_time']\n",
    "df['c_date'] = timestamps.apply(get_date)\n",
    "df['c_time'] = timestamps.apply(get_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>p_name</th>\n",
       "      <th>p_url</th>\n",
       "      <th>c_date_time</th>\n",
       "      <th>c_data</th>\n",
       "      <th>c_rank</th>\n",
       "      <th>c_recom</th>\n",
       "      <th>c_date</th>\n",
       "      <th>c_time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>王大根</td>\n",
       "      <td>https://www.douban.com/people/diewithme/</td>\n",
       "      <td>2018-01-19 18:17:25</td>\n",
       "      <td>在这种家庭里做一条狗都好啊\\n</td>\n",
       "      <td>力荐</td>\n",
       "      <td>6463</td>\n",
       "      <td>2018-01-19</td>\n",
       "      <td>18:17:25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>李阿斗</td>\n",
       "      <td>https://www.douban.com/people/gailsylee/</td>\n",
       "      <td>2017-11-25 02:12:27</td>\n",
       "      <td>当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...</td>\n",
       "      <td>力荐</td>\n",
       "      <td>3429</td>\n",
       "      <td>2017-11-25</td>\n",
       "      <td>02:12:27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>光明小卖部</td>\n",
       "      <td>https://www.douban.com/people/gooooooooooohe/</td>\n",
       "      <td>2017-12-06 15:10:45</td>\n",
       "      <td>所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n</td>\n",
       "      <td>力荐</td>\n",
       "      <td>3349</td>\n",
       "      <td>2017-12-06</td>\n",
       "      <td>15:10:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>同志亦凡人中文站</td>\n",
       "      <td>https://www.douban.com/people/3540441/</td>\n",
       "      <td>2017-11-24 15:57:52</td>\n",
       "      <td>有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...</td>\n",
       "      <td>推荐</td>\n",
       "      <td>1814</td>\n",
       "      <td>2017-11-24</td>\n",
       "      <td>15:57:52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>桃桃淘电影</td>\n",
       "      <td>https://www.douban.com/people/qijiuzhiyue/</td>\n",
       "      <td>2018-01-19 14:18:28</td>\n",
       "      <td>其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...</td>\n",
       "      <td>推荐</td>\n",
       "      <td>1711</td>\n",
       "      <td>2018-01-19</td>\n",
       "      <td>14:18:28</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     p_name                                          p_url  \\\n",
       "0       王大根       https://www.douban.com/people/diewithme/   \n",
       "1       李阿斗       https://www.douban.com/people/gailsylee/   \n",
       "2     光明小卖部  https://www.douban.com/people/gooooooooooohe/   \n",
       "3  同志亦凡人中文站         https://www.douban.com/people/3540441/   \n",
       "4     桃桃淘电影     https://www.douban.com/people/qijiuzhiyue/   \n",
       "\n",
       "           c_date_time                                             c_data  \\\n",
       "0  2018-01-19 18:17:25                            在这种家庭里做一条狗都好啊\\n           \n",
       "1  2017-11-25 02:12:27   当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...   \n",
       "2  2017-12-06 15:10:45            所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n           \n",
       "3  2017-11-24 15:57:52   有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...   \n",
       "4  2018-01-19 14:18:28   其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...   \n",
       "\n",
       "  c_rank  c_recom      c_date    c_time  \n",
       "0     力荐     6463  2018-01-19  18:17:25  \n",
       "1     力荐     3429  2017-11-25  02:12:27  \n",
       "2     力荐     3349  2017-12-06  15:10:45  \n",
       "3     推荐     1814  2017-11-24  15:57:52  \n",
       "4     推荐     1711  2018-01-19  14:18:28  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows, now including c_date and c_time.\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Before->\n",
      " p_name         object\n",
      "p_url          object\n",
      "c_date_time    object\n",
      "c_data         object\n",
      "c_rank         object\n",
      "c_recom         int64\n",
      "c_date         object\n",
      "c_time         object\n",
      "dtype: object\n",
      "After->\n",
      " p_name                 object\n",
      "p_url                  object\n",
      "c_date_time    datetime64[ns]\n",
      "c_data                 object\n",
      "c_rank                 object\n",
      "c_recom                 int64\n",
      "c_date                 object\n",
      "c_time                 object\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Optional: convert column dtypes where useful.\n",
    "# pd.to_datetime with errors='coerce' turns unparseable timestamps into NaT\n",
    "# instead of aborting the whole cell, which astype('datetime64[ns]') does as\n",
    "# soon as a single value cannot be parsed.\n",
    "print('Before->\\n', df.dtypes)\n",
    "df['c_date_time'] = pd.to_datetime(df['c_date_time'], errors='coerce')\n",
    "print('After->\\n',  df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 也可方便地进行数据转换[Encoding Categorical Values]\n",
    "# 将汉字对应编码为数字\n",
    "def trans(words):\n",
    "    \"\"\"Map a rating label to its numeric score (5 down to 1).\n",
    "\n",
    "    Returns None for any value that is not one of the five known labels.\n",
    "    \"\"\"\n",
    "    # Label/score pairs, checked in order from best to worst rating.\n",
    "    label_scores = (\n",
    "        ('力荐', 5),\n",
    "        ('推荐', 4),\n",
    "        ('还行', 3),\n",
    "        ('较差', 2),\n",
    "        ('很差', 1),\n",
    "    )\n",
    "    for label, score in label_scores:\n",
    "        if words == label:\n",
    "            return score\n",
    "    return None\n",
    "\n",
    "\n",
    "# Store the numeric score next to the original label column.\n",
    "rank_scores = df['c_rank'].apply(trans)\n",
    "df['c_rank_num'] = rank_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>p_name</th>\n",
       "      <th>p_url</th>\n",
       "      <th>c_date_time</th>\n",
       "      <th>c_data</th>\n",
       "      <th>c_rank</th>\n",
       "      <th>c_recom</th>\n",
       "      <th>c_date</th>\n",
       "      <th>c_time</th>\n",
       "      <th>c_rank_num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>王大根</td>\n",
       "      <td>https://www.douban.com/people/diewithme/</td>\n",
       "      <td>2018-01-19 18:17:25</td>\n",
       "      <td>在这种家庭里做一条狗都好啊\\n</td>\n",
       "      <td>力荐</td>\n",
       "      <td>6463</td>\n",
       "      <td>2018-01-19</td>\n",
       "      <td>18:17:25</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>李阿斗</td>\n",
       "      <td>https://www.douban.com/people/gailsylee/</td>\n",
       "      <td>2017-11-25 02:12:27</td>\n",
       "      <td>当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...</td>\n",
       "      <td>力荐</td>\n",
       "      <td>3429</td>\n",
       "      <td>2017-11-25</td>\n",
       "      <td>02:12:27</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>光明小卖部</td>\n",
       "      <td>https://www.douban.com/people/gooooooooooohe/</td>\n",
       "      <td>2017-12-06 15:10:45</td>\n",
       "      <td>所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n</td>\n",
       "      <td>力荐</td>\n",
       "      <td>3349</td>\n",
       "      <td>2017-12-06</td>\n",
       "      <td>15:10:45</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>同志亦凡人中文站</td>\n",
       "      <td>https://www.douban.com/people/3540441/</td>\n",
       "      <td>2017-11-24 15:57:52</td>\n",
       "      <td>有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...</td>\n",
       "      <td>推荐</td>\n",
       "      <td>1814</td>\n",
       "      <td>2017-11-24</td>\n",
       "      <td>15:57:52</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>桃桃淘电影</td>\n",
       "      <td>https://www.douban.com/people/qijiuzhiyue/</td>\n",
       "      <td>2018-01-19 14:18:28</td>\n",
       "      <td>其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...</td>\n",
       "      <td>推荐</td>\n",
       "      <td>1711</td>\n",
       "      <td>2018-01-19</td>\n",
       "      <td>14:18:28</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     p_name                                          p_url  \\\n",
       "0       王大根       https://www.douban.com/people/diewithme/   \n",
       "1       李阿斗       https://www.douban.com/people/gailsylee/   \n",
       "2     光明小卖部  https://www.douban.com/people/gooooooooooohe/   \n",
       "3  同志亦凡人中文站         https://www.douban.com/people/3540441/   \n",
       "4     桃桃淘电影     https://www.douban.com/people/qijiuzhiyue/   \n",
       "\n",
       "          c_date_time                                             c_data  \\\n",
       "0 2018-01-19 18:17:25                            在这种家庭里做一条狗都好啊\\n           \n",
       "1 2017-11-25 02:12:27   当时出国以后的第一最大感受就是尊重，不论老弱病孕还是任何“与众不同”，都不会有人上下打量你...   \n",
       "2 2017-12-06 15:10:45            所有人都知道是化妆只有我一个人以为请的真实病人出演吗。。。\\n           \n",
       "3 2017-11-24 15:57:52   有种糖放多了的感觉，精华基本都在预告里了。但对孩子们的纯真友情毫无抵抗力啊，就像被温柔的抚...   \n",
       "4 2018-01-19 14:18:28   其实这更像当代童话，因为，实在是太暖了。里面每个人都那么暖，怎么可以那么暖，怎么可以那么暖...   \n",
       "\n",
       "  c_rank  c_recom      c_date    c_time  c_rank_num  \n",
       "0     力荐     6463  2018-01-19  18:17:25           5  \n",
       "1     力荐     3429  2017-11-25  02:12:27           5  \n",
       "2     力荐     3349  2017-12-06  15:10:45           5  \n",
       "3     推荐     1814  2017-11-24  15:57:52           4  \n",
       "4     推荐     1711  2018-01-19  14:18:28           4  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows, now including c_rank_num.\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make c_date_time the index. set_index also removes it from the columns,\n",
    "# so no separate drop step is needed.\n",
    "df = df.set_index('c_date_time')\n",
    "\n",
    "# Other operations can go here..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the rows whose values went missing during the steps above.\n",
    "df = df.dropna()\n",
    "# Save the preprocessed data.\n",
    "df.to_csv('douban_processed.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
